In this study_____

load libraries

library("tidyverse")
library("plyr")
library("dplyr")
library("ggplot2")
library("scales")
library("ggpubr")
library("gridExtra")
library("grid")
library("GGally")
library("vcfR") # for extracting genotype data from a vcf file
library("data.table")
library("stringr")
library("janitor")
library("gmodels")
library("rstatix")
library("freqtables")
library("broom")
library("DescTools") # for the Goodness-of-Fit test, 3 variables; and for Breslow-Day Test for Homogeneity of the Odds Ratios
library("vcd") # for the woolf test testign log odds for each pair
library("patchwork") # for gathering the plots
#library("fuzzyjoin") # to join tables based on a string in a column
#library("aspi") # Repeated G–tests of Goodness-of-Fit, work only for 2 variables..
#library("RVAideMemoire") # Repeated G–tests of Goodness-of-Fit, work only for 2 variables...
#library("InfoTrad")
#library("ggthemes") # for more colors in the ggplot
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                     fig.width = 8,
                      fig.asp = 0.6,
                      out.width = "100%")
#fig.width = 6,fig.asp = 0.8,out.width = "100%"

Load Variant Call Format (VCF) file.

Extract genotypes for each site and individual

vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.recode.vcf", verbose = FALSE )
vcf
## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 35,169 variants
## Object size: 293.5 Mb
## 0 percent missing data
## *****        *****         *****
# extract the genotype for each site in each individual
gt <- extract.gt(vcf, element = "GT") 
gt <- as.data.frame(t(gt)) %>%
  rownames_to_column("ID")
#clean the ID names
gt$ID <- sub("_[^_]+$", "", gt$ID)

We keep only F1 and F2 generations, in which we have both adult male and female.
Eventually, we are left with 114 individuals of 26 families, and all 35,169 biallelic sites.


For example, these are the first 6 sites (13565-25513) of the first chromosome (NW_019211454.1), in one family (family number 240).

table <-  gt %>% 
  t() %>%
  as.data.frame() %>%
  row_to_names(row_number = 1) %>%
  dplyr::select(contains(c("son", "dat"))) # keep only adults of F1 and F2 

table %>% select(contains(c("240_"))) %>% head()
##                      240_240a_son 240_241c_grnson 240_241b_grndat
## NW_019211454.1_13565          0/1            <NA>            <NA>
## NW_019211454.1_18037          0/0            <NA>            <NA>
## NW_019211454.1_20998         <NA>            <NA>            <NA>
## NW_019211454.1_24935         <NA>             0/0            <NA>
## NW_019211454.1_25470         <NA>            <NA>            <NA>
## NW_019211454.1_25513         <NA>            <NA>            <NA>
##                      240_241a_grndat 240_241_dat
## NW_019211454.1_13565             0/1         0/1
## NW_019211454.1_18037             0/0         0/0
## NW_019211454.1_20998             0/0         0/0
## NW_019211454.1_24935             0/0         0/0
## NW_019211454.1_25470             0/0         0/0
## NW_019211454.1_25513             0/0        <NA>

The family include:

  • Brother and sister of F1 generation: adult male (240_240a_son) and adult females (240_241_dat).
  • Their offspring in the F2 generation: adult male (240_241c_grnson) and two adult females (240_241a_grndat and 240_241b_grndat).

Each individual, was genotyped for each site with one of the three genotypes:

  • Homozygote for the reference allele = 0/0
  • Heterozygote = 0/1
  • Homozygote for the alternate allele = 1/1
  • “NA” = site genotype not determined

In the following code we viewed the F2 offspring genotype of all possible nine crosses between F1 male and females.

  • Control crosses of homozygotic sites: The first four crosses are aiming mainly to detect false sites:
    1. F1 male (0/0) x female (0/0)
    2. F1 male (1/1) x female (1/1)
    3. F1 male (0/0) x female (1/1)
    4. F1 male (1/1) x female (0/0)

Then we cross heterozygotic sites, to explore the mode of genetic inheritance in varroa mite:

  • Homozygotic male, with heterozygotic female:
    1. F1 male (0/0) x female (0/1)
    2. F1 male (1/1) x female (0/1)
  • Heterozygotic male, with homozygotic female:
    1. F1 male (0/1) x female (0/0)
    2. F1 male (0/1) x female (1/1)
  • Heterozygotic male, with heterozygotic female:
    1. F1 male (0/1) x female (0/1)

For each crossing we have expected genotype-ratio of the F2, based on the following assumptions under arrhenotokous haplodiploid system:

  • Fertilized egg develops into diploid female ♀ (2n).
  • Unfertilized egg develops into haploid male ♂ (n).
  • No paternal inheritance in the males.

Control: Homozygotic sites (crosses 1-4)

To detect false sites

(1) F1 male (0/0) x female (0/0)

we expect all F2 offspring to be homozygotic (0/0) like their parents (F1)

x <- dat %>%
  dplyr::select(c("sample","gt","sex", "exp","prop", "total")) %>%
  mutate(type = "exp") %>%
  dplyr::rename(count = exp)

y <- dat %>%
  dplyr::select(c("sample","gt","sex", "obs","total")) %>%
  mutate(type = "obs") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))
  
df_male <- rbind(x, y) %>%
  dplyr::filter(sex == "male")
      
df_fem <- rbind(x, y) %>%
  dplyr::filter(sex == "female")

p_male_pooled <- ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 males") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (0/0) x female (0/0)", 
         subtitle = paste0("Pooled sites = ", df_male %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

p_female_pooled <- ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 females") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 females of F1 male (0/0) x female (0/0)", 
         subtitle = paste0("Pooled sites = ", df_fem %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
    scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

grid.arrange(p_female_pooled, p_male_pooled, ncol = 2)

# plot ratio for each individual: 
# males:
ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (0/0) x female (0/0)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_male, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

# and females:
ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Females of F1 male (0/0) x female (0/0)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_fem, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

  • make a vector with all sites that are not 0/0 in F2

(2) F1 male (1/1) x female (1/1)

we expect all F2 offspring to be homozygotic (1/1) like their parents (F1)

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs)

# state the expected site proportion of each genotype and sex under a perfect mendelian segregation of F2
fem_exp <- data.frame(sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0,0,1))
male_exp <- data.frame(sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0,0,1))
mendel <- bind_rows(fem_exp,male_exp) %>% mutate(sex = as.character(sex))%>% mutate(gt = as.character(gt)) 
 # mutate(type = "exp")

# using on the total genotypes count of sites per sample, calculate the expected counts, and add them to the observed counts into a final dat table
dat <- observed %>%
  #mutate(sample = as.character(sample)) %>%
  mutate(gt = as.character(gt)) %>%
  left_join(mendel, by = c("gt", "sex")) %>% 
  mutate(exp = total*prop)
x <- dat %>%
  dplyr::select(c("sample","gt","sex", "exp","prop", "total")) %>%
  mutate(type = "exp") %>%
  dplyr::rename(count = exp)

y <- dat %>%
  dplyr::select(c("sample","gt","sex", "obs","total")) %>%
  mutate(type = "obs") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))
  
df_male <- rbind(x, y) %>%
  dplyr::filter(sex == "male")
      
df_fem <- rbind(x, y) %>%
  dplyr::filter(sex == "female")

p_male_pooled <- ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 males") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (1/1) x female (1/1)", 
         subtitle = paste0("Pooled sites = ", df_male %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

p_female_pooled <- ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 females") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 females of F1 male (1/1) x female (1/1)", 
         subtitle = paste0("Pooled sites = ", df_fem %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
    scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

grid.arrange(p_female_pooled, p_male_pooled, ncol = 2)

ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (1/1) x female (1/1)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_male, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Females of F1 male (1/1) x female (1/1)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_fem, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

  • make a vector with all sites that are not 1/1 in F2

(3) F1 male (0/0) x female (1/1)

We expect no sites for this cross, since there is not paternal inheritance to the males

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs)

# state the expected site proportion of each genotype and sex under a perfect mendelian segregation of F2
fem_exp <- data.frame(sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0,0,0))
male_exp <- data.frame(sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0,0,0))
mendel <- bind_rows(fem_exp,male_exp) %>% mutate(sex = as.character(sex))%>% mutate(gt = as.character(gt)) 
 # mutate(type = "exp")

# using on the total genotypes count of sites per sample, calculate the expected counts, and add them to the observed counts into a final dat table
dat <- observed %>%
  #mutate(sample = as.character(sample)) %>%
  mutate(gt = as.character(gt)) %>%
  left_join(mendel, by = c("gt", "sex")) %>% 
  mutate(exp = total*prop)
x <- dat %>%
  dplyr::select(c("sample","gt","sex", "exp","prop", "total")) %>%
  mutate(type = "exp") %>%
  dplyr::rename(count = exp)

y <- dat %>%
  dplyr::select(c("sample","gt","sex", "obs","total")) %>%
  mutate(type = "obs") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))
  
df_male <- rbind(x, y) %>%
  dplyr::filter(sex == "male")
      
df_fem <- rbind(x, y) %>%
  dplyr::filter(sex == "female")

p_male_pooled <- ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 males") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (0/0) x female (1/1)", 
         subtitle = paste0("Pooled sites = ", df_male %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

p_female_pooled <- ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 females") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 females of F1 male (0/0) x female (1/1)", 
         subtitle = paste0("Pooled sites = ", df_fem %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
    scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

grid.arrange(p_female_pooled, p_male_pooled, ncol = 2)

ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (0/0) x female (1/1)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_male, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Females of F1 male (0/0) x female (1/1)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_fem, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

(4) F1 male (1/1) x female (0/0)

We expect no sites for this cross, sinse there is not paternal inheritance to the males

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs)

# state the expected site proportion of each genotype and sex under a perfect mendelian segregation of F2
fem_exp <- data.frame(sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0,0,0))
male_exp <- data.frame(sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0,0,0))
mendel <- bind_rows(fem_exp,male_exp) %>% mutate(sex = as.character(sex))%>% mutate(gt = as.character(gt)) 
 # mutate(type = "exp")

# using on the total genotypes count of sites per sample, calculate the expected counts, and add them to the observed counts into a final dat table
dat <- observed %>%
  #mutate(sample = as.character(sample)) %>%
  mutate(gt = as.character(gt)) %>%
  left_join(mendel, by = c("gt", "sex")) %>% 
  mutate(exp = total*prop)
x <- dat %>%
  dplyr::select(c("sample","gt","sex", "exp","prop", "total")) %>%
  mutate(type = "exp") %>%
  dplyr::rename(count = exp)

y <- dat %>%
  dplyr::select(c("sample","gt","sex", "obs","total")) %>%
  mutate(type = "obs") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))
  
df_male <- rbind(x, y) %>%
  dplyr::filter(sex == "male")
      
df_fem <- rbind(x, y) %>%
  dplyr::filter(sex == "female")

p_male_pooled <- ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 males") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (1/1) x female (0/0)", 
         subtitle = paste0("Pooled sites = ", df_male %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

p_female_pooled <- ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 females") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 females of F1 male (1/1) x female (0/0)", 
         subtitle = paste0("Pooled sites = ", df_fem %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric()))+ 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

grid.arrange(p_female_pooled, p_male_pooled, ncol = 2)

ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (1/1) x female (0/0)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_male, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Females of F1 male (1/1) x female (0/0)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_fem, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

Crossing homozygotic male, with heterozygotic female

(5) F1 male (0/0) x female (0/1)

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs)

# state the expected site proportion of each genotype and sex under a perfect mendelian segregation of F2
fem_exp <- data.frame(sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0.5, 0.5,0))
male_exp <- data.frame(sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0.5, 0,0.5))
mendel <- bind_rows(fem_exp,male_exp) %>% mutate(sex = as.character(sex))%>% mutate(gt = as.character(gt)) 
 # mutate(type = "exp")

# using on the total genotypes count of sites per sample, calculate the expected counts, and add them to the observed counts into a final dat table
dat <- observed %>%
  #mutate(sample = as.character(sample)) %>%
  mutate(gt = as.character(gt)) %>%
  left_join(mendel, by = c("gt", "sex")) %>% 
  mutate(exp = total*prop)
x <- dat %>%
  dplyr::select(c("sample","gt","sex", "exp","prop", "total")) %>%
  mutate(type = "exp") %>%
  dplyr::rename(count = exp)

y <- dat %>%
  dplyr::select(c("sample","gt","sex", "obs","total")) %>%
  mutate(type = "obs") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))
  
df_male <- rbind(x, y) %>%
  dplyr::filter(sex == "male")
      
df_fem <- rbind(x, y) %>%
  dplyr::filter(sex == "female")

p_male_pooled <- ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 males") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (0/0) x female (0/1)", 
         subtitle = paste0("Pooled sites = ", df_male %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

p_female_pooled <- ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 females") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 females of F1 male (0/0) x female (0/1)", 
         subtitle = paste0("Pooled sites = ", df_fem %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

grid.arrange(p_female_pooled, p_male_pooled, ncol = 2)

ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (0/0) x female (0/1)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_male, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") +
  theme_classic()

ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Females of F1 male (0/0) x female (0/1)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_fem, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") +
  theme_classic()

(6) F1 male (1/1) x female (0/1)

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs)

# state the expected site proportion of each genotype and sex under a perfect mendelian segregation of F2
fem_exp <- data.frame(sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0,0.5,1))
male_exp <- data.frame(sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0.5,0,0.5))
mendel <- bind_rows(fem_exp,male_exp) %>% mutate(sex = as.character(sex))%>% mutate(gt = as.character(gt)) 
 # mutate(type = "exp")

# using on the total genotypes count of sites per sample, calculate the expected counts, and add them to the observed counts into a final dat table
dat <- observed %>%
  #mutate(sample = as.character(sample)) %>%
  mutate(gt = as.character(gt)) %>%
  left_join(mendel, by = c("gt", "sex")) %>% 
  mutate(exp = total*prop)
x <- dat %>%
  dplyr::select(c("sample","gt","sex", "exp","prop", "total")) %>%
  mutate(type = "exp") %>%
  dplyr::rename(count = exp)

y <- dat %>%
  dplyr::select(c("sample","gt","sex", "obs","total")) %>%
  mutate(type = "obs") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))
  
df_male <- rbind(x, y) %>%
  dplyr::filter(sex == "male")
      
df_fem <- rbind(x, y) %>%
  dplyr::filter(sex == "female")

p_male_pooled <- ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 males") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (1/1) x female (0/1)", 
         subtitle = paste0("Pooled sites = ", df_male %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

p_female_pooled <- ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 females") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 females of F1 male (1/1) x female (0/1)", 
         subtitle = paste0("Pooled sites = ", df_fem %>% dplyr::filter(gt == "0/0") %>% dplyr::filter(type == "obs") %>% summarise(sum(total)) %>% as.numeric())) + 
   scale_x_discrete("type", labels = c("Expected","Observed")) +
labs(fill = "Genotype") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=15),
        axis.title.x = element_blank()) 

grid.arrange(p_female_pooled, p_male_pooled, ncol = 2)

ggplot(df_male, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Males of F1 male (1/1) x female (0/1)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_male, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") +
  theme_classic()

ggplot(df_fem, aes(fill=gt, y=prop, x=type)) + 
    geom_bar(position="fill", stat="identity", ) +
    xlab("F2 siblings") + 
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 Females of F1 male (1/1) x female (0/1)") + 
   # geom_text(aes(label = n), position = position_stack(vjust = 0.5)) +
    geom_text(data = filter(df_fem, gt == "0/0"), aes(y=prop, x=type, label=total), vjust = 0) +
    facet_wrap(~ sample) +
  labs(fill = "Genotype") +
  theme_classic()

Crossing heterozygotic male, with homozygotic female

The former crosses of heterozygotic females (5 and 6) show that F2 males can be heterozygotic and carry two alleles, like their mother.
But are these sites real? and if they are, can they be transmitted to their daughters?
Looking at the F2 offspring of crossing heterozygotic F1 males with females, we tested 3 possible predictions:

  1. Heterozygotic sites are false: they are actually hemizygotic (0)
  2. Heterozygotic sites are false: they are actually hemizygotic (1)
  3. Heterozygotic sites are real: Males can carry and inherit two alleles (0/1)

(7) F1 male (0/1) x female (0/0)

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) 

# state the expected site proportion of each genotype and sex under the different hypothesis:
# Heterozygotic sites are real:Males can carry and inherit two alleles
fem_sitesReal <- data.frame(type = "sitesReal", sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0.5, 0.5, 0))

# Heterozygotic sites are false:Males can carry and inherit only one allele
# the heterozygotic sites are actually (0)
fem_sitesFalse_0 <- data.frame(type = "sitesFalse_0",sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(1,0,0))
# the heterozygotic sites are actually (1)
fem_sitesFalse_1 <- data.frame(type = "sitesFalse_1", sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0,1,0))

# males F2 genotype is expected to stay the same , as it is produced from unfertilized egg, hence have no paternal inheritance
male_sitesReal <- data.frame(type = "sitesReal",sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(1,0,0))
male_sitesFalse_0 <- data.frame(type = "sitesFalse_0",sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(1,0,0))
male_sitesFalse_1 <- data.frame(type = "sitesFalse_1", sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(1,0,0))

expected_prop <- bind_rows(fem_sitesReal,fem_sitesFalse_0,fem_sitesFalse_1,male_sitesReal,male_sitesFalse_0,male_sitesFalse_1) %>% mutate(sex = as.character(sex))%>% mutate(gt = as.character(gt)) 
 # mutate(type = "exp")

# using on the total genotypes count of sites per sample, calculate the expected counts, and add them to the observed counts into a final dat table
expected <- observed %>%
  mutate(gt = as.character(gt)) %>%
  full_join(expected_prop, by = c("gt", "sex")) %>% 
  mutate(count = total*prop)

x <- observed %>%
  dplyr::select(c("sample","sex","gt","total", "obs")) %>%
  mutate(type = "Observed") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))

dat <-  bind_rows(x, expected) %>% 
  dplyr::select(-obs)
# subset for males and females, and reorder the "types" levels:
dat_fem <- dat %>% dplyr::filter(sex == "female")
level_order_fem <- factor(dat_fem$type, level = c("sitesFalse_0", "sitesFalse_1", "sitesReal", "Observed"))

dat_male <- dat %>% dplyr::filter(sex == "male")
level_order_male <- factor(dat_male$type, level = c("sitesFalse_0", "sitesFalse_1", "sitesReal", "Observed"))

p_male_pooled <- ggplot(dat_male, aes(fill=gt, y=prop, x=level_order_male)) + 
    geom_bar(position="fill", stat="identity", width = 0.8) +
    ylab("Proportion of F2 genotype") +
    labs(title = paste0("F2 Male, pooled sites = ", dat_male %>% dplyr::filter(type == "Observed") %>% summarise(sum(count)) %>% as.numeric()))  + 
  labs(fill = "Genotype:") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=12, angle = 45,hjust = 1),
        axis.title.x = element_blank())

p_female_pooled <- ggplot(dat_fem, aes(fill=gt, y=prop, x=level_order_fem)) + 
    geom_bar(position="fill", stat="identity", width = 0.8) +
    ylab("Proportion of F2 genotype") +
    labs(title = paste0("F2 Female, pooled sites = ", dat_fem %>% dplyr::filter(type == "Observed") %>% summarise(sum(count)) %>% as.numeric())) + 
  labs(fill = "Genotype:") +
  theme_classic() +
  theme(axis.text.x = element_text(size=12, angle = 45,hjust = 1),
        axis.title.x = element_blank())

combined <- p_female_pooled + p_male_pooled & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect") + plot_annotation(
  title = 'F2 genotype. F1 Cross: male (0/1) x female (0/0)')

# plot each individual:
ggplot(dat_male, aes(fill=gt, y=prop, x=level_order_male)) + 
    geom_bar(position="fill", stat="identity", ) +
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 male genotype. F1 Cross: male (0/1) x female (0/0)", fill = "Genotype:") +
    geom_text(data = dat_male %>% filter(gt == "0/1") %>% filter(type == "Observed"), aes(x = 4, y = 0.8, label = total), inherit.aes = FALSE) +
    facet_wrap(~ sample) +
    theme_classic()+ 
    theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_text(size=8, angle = 45,hjust = 1)) 

ggplot(dat_fem, aes(fill=gt, y=prop, x=level_order_fem)) + 
    geom_bar(position="fill", stat="identity", ) +
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 female genotype. F1 Cross: male (0/1) x female (0/0)", fill = "Genotype:") + 
    geom_text(data = dat_fem %>% filter(gt == "0/1") %>% filter(type == "Observed"), aes(x = 4, y = 0.8, label = total), inherit.aes = FALSE) +
    facet_wrap(~ sample) +
    theme_classic()+ 
    theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_text(size=8, angle = 45,hjust = 1)) 

(8) F1 male (0/1) x female (1/1)

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) 

# state the expected site proportion of each genotype and sex under the different hypothesis:
# Heterozygotic sites are real:Males can carry and inherit two alleles
fem_sitesReal <- data.frame(type = "sitesReal", sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0,0.5,0.5))

# Heterozygotic sites are false:Males can carry and inherit only one allele
# the heterozygotic sites are actually (0)
fem_sitesFalse_0 <- data.frame(type = "sitesFalse_0",sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0,1,0))
# the heterozygotic sites are actually (1)
fem_sitesFalse_1 <- data.frame(type = "sitesFalse_1", sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0,0,1))

# males F2 genotype is expected to stay the same , as it is produced from unfertilized egg, hence have no paternal inheritance
male_sitesReal <- data.frame(type = "sitesReal",sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0,0,1))
male_sitesFalse_0 <- data.frame(type = "sitesFalse_0",sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0,0,1))
male_sitesFalse_1 <- data.frame(type = "sitesFalse_1", sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0,0,1))

expected_prop <- bind_rows(fem_sitesReal,fem_sitesFalse_0,fem_sitesFalse_1,male_sitesReal,male_sitesFalse_0,male_sitesFalse_1) %>% mutate(sex = as.character(sex))%>% mutate(gt = as.character(gt)) 
 # mutate(type = "exp")

# using on the total genotypes count of sites per sample, calculate the expected counts, and add them to the observed counts into a final dat table
expected <- observed %>%
  mutate(gt = as.character(gt)) %>%
  full_join(expected_prop, by = c("gt", "sex")) %>% 
  mutate(count = total*prop)

x <- observed %>%
  dplyr::select(c("sample","sex","gt","total", "obs")) %>%
  mutate(type = "Observed") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))

dat <-  bind_rows(x, expected) %>% 
  dplyr::select(-obs)
# subset for males and females, and reorder the "types" levels:
dat_fem <- dat %>% dplyr::filter(sex == "female")
level_order_fem <- factor(dat_fem$type, level = c("sitesFalse_0", "sitesFalse_1", "sitesReal", "Observed"))

dat_male <- dat %>% dplyr::filter(sex == "male")
level_order_male <- factor(dat_male$type, level = c("sitesFalse_0", "sitesFalse_1", "sitesReal", "Observed"))

p_male_pooled <- ggplot(dat_male, aes(fill=gt, y=prop, x=level_order_male)) + 
    geom_bar(position="fill", stat="identity", width = 0.8) +
    ylab("Proportion of F2 genotype") +
    labs(title = paste0("F2 Male, pooled sites = ", dat_male %>% dplyr::filter(type == "Observed") %>% summarise(sum(count)) %>% as.numeric()))  + 
  labs(fill = "Genotype:") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=12, angle = 45,hjust = 1),
        axis.title.x = element_blank())

p_female_pooled <- ggplot(dat_fem, aes(fill=gt, y=prop, x=level_order_fem)) + 
    geom_bar(position="fill", stat="identity", width = 0.8) +
    ylab("Proportion of F2 genotype") +
    labs(title = paste0("F2 Female, pooled sites = ", dat_fem %>% dplyr::filter(type == "Observed") %>% summarise(sum(count)) %>% as.numeric())) + 
  labs(fill = "Genotype:") +
  theme_classic() +
  theme(axis.text.x = element_text(size=12, angle = 45,hjust = 1),
        axis.title.x = element_blank())

combined <- p_female_pooled + p_male_pooled & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect") + plot_annotation(
  title = 'F2 genotype. F1 Cross: male (0/1) x female (1/1)')

# plot each individual:
ggplot(dat_male, aes(fill=gt, y=prop, x=level_order_male)) + 
    geom_bar(position="fill", stat="identity", ) +
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 male genotype. F1 Cross: male (0/1) x female (1/1)", fill = "Genotype:") +
    geom_text(data = dat_male %>% filter(gt == "0/1") %>% filter(type == "Observed"), aes(x = 4, y = 0.8, label = total), inherit.aes = FALSE) +
    facet_wrap(~ sample) +
    theme_classic()+ 
    theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_text(size=8, angle = 45,hjust = 1)) 

ggplot(dat_fem, aes(fill=gt, y=prop, x=level_order_fem)) + 
    geom_bar(position="fill", stat="identity", ) +
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 female genotype. F1 Cross: male (0/1) x female (1/1)", fill = "Genotype:") + 
    geom_text(data = dat_fem %>% filter(gt == "0/1") %>% filter(type == "Observed"), aes(x = 4, y = 0.8, label = total), inherit.aes = FALSE) +
    facet_wrap(~ sample) +
    theme_classic()+ 
    theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_text(size=8, angle = 45,hjust = 1)) 

(9) F1 male (0/1) x female (0/1)

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) 

# state the expected site proportion of each genotype and sex under the different hypothesis:
# Heterozygotic sites are real:Males can carry and inherit two alleles
fem_sitesReal <- data.frame(type = "sitesReal", sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0.25, 0.5, 0.25))

# Heterozygotic sites are false:Males can carry and inherit only one allele
# the heterozygotic sites are actually (0)
fem_sitesFalse_0 <- data.frame(type = "sitesFalse_0",sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0.5, 0.5, 0))
# the heterozygotic sites are actually (1)
fem_sitesFalse_1 <- data.frame(type = "sitesFalse_1", sex = "female", gt = c("0/0", "0/1", "1/1") , prop = c(0, 0.5, 0.5))

# males F2 genotype is expected to stay the same , as it is produced from unfertilized egg, hence have no paternal inheritance
male_sitesReal <- data.frame(type = "sitesReal",sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0.5, 0, 0.5))
male_sitesFalse_0 <- data.frame(type = "sitesFalse_0",sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0.5, 0, 0.5))
male_sitesFalse_1 <- data.frame(type = "sitesFalse_1", sex = "male",gt = c("0/0", "0/1", "1/1"),prop = c(0.5, 0, 0.5))

expected_prop <- bind_rows(fem_sitesReal,fem_sitesFalse_0,fem_sitesFalse_1,male_sitesReal,male_sitesFalse_0,male_sitesFalse_1) %>% mutate(sex = as.character(sex))%>% mutate(gt = as.character(gt)) 
 # mutate(type = "exp")

# using on the total genotypes count of sites per sample, calculate the expected counts, and add them to the observed counts into a final dat table
expected <- observed %>%
  mutate(gt = as.character(gt)) %>%
  full_join(expected_prop, by = c("gt", "sex")) %>% 
  mutate(count = total*prop)

x <- observed %>%
  dplyr::select(c("sample","sex","gt","total", "obs")) %>%
  mutate(type = "Observed") %>%
  dplyr::rename(count = obs) %>%
  mutate(prop = (count/total))

dat <-  bind_rows(x, expected) %>% 
  dplyr::select(-obs)
# subset for males and females, and reorder the "types" levels:
dat_fem <- dat %>% dplyr::filter(sex == "female")
level_order_fem <- factor(dat_fem$type, level = c("sitesFalse_0", "sitesFalse_1", "sitesReal", "Observed"))

dat_male <- dat %>% dplyr::filter(sex == "male")
level_order_male <- factor(dat_male$type, level = c("sitesFalse_0", "sitesFalse_1", "sitesReal", "Observed"))

p_male_pooled <- ggplot(dat_male, aes(fill=gt, y=prop, x=level_order_male)) + 
    geom_bar(position="fill", stat="identity", width = 0.8) +
    ylab("Proportion of F2 genotype") +
    labs(title = paste0("F2 Male, pooled sites = ", dat_male %>% dplyr::filter(type == "Observed") %>% summarise(sum(count)) %>% as.numeric()))  + 
  labs(fill = "Genotype:") + 
  theme_classic() +
  theme(axis.text.x = element_text(size=12, angle = 45,hjust = 1),
        axis.title.x = element_blank())

p_female_pooled <- ggplot(dat_fem, aes(fill=gt, y=prop, x=level_order_fem)) + 
    geom_bar(position="fill", stat="identity", width = 0.8) +
    ylab("Proportion of F2 genotype") +
    labs(title = paste0("F2 Female, pooled sites = ", dat_fem %>% dplyr::filter(type == "Observed") %>% summarise(sum(count)) %>% as.numeric())) + 
  labs(fill = "Genotype:") +
  theme_classic() +
  theme(axis.text.x = element_text(size=12, angle = 45,hjust = 1),
        axis.title.x = element_blank())

combined <- p_female_pooled + p_male_pooled & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect") + plot_annotation(
  title = 'F2 genotype. F1 Cross: male (0/1) x female (0/1)')

# plot each individual:
ggplot(dat_male, aes(fill=gt, y=prop, x=level_order_male)) + 
    geom_bar(position="fill", stat="identity", ) +
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 male genotype. F1 Cross: male (0/1) x female (0/1)", fill = "Genotype:") +
    geom_text(data = dat_male %>% filter(gt == "0/1") %>% filter(type == "Observed"), aes(x = 4, y = 0.8, label = total), inherit.aes = FALSE) +
    facet_wrap(~ sample) +
    theme_classic()+ 
    theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_text(size=8, angle = 45,hjust = 1)) 

ggplot(dat_fem, aes(fill=gt, y=prop, x=level_order_fem)) + 
    geom_bar(position="fill", stat="identity", ) +
    ylab("Proportion of F2 genotype") +
    labs(title = "F2 female genotype. F1 Cross: male (0/1) x female (0/1)", fill = "Genotype:") + 
    geom_text(data = dat_fem %>% filter(gt == "0/1") %>% filter(type == "Observed"), aes(x = 4, y = 0.8, label = total), inherit.aes = FALSE) +
    facet_wrap(~ sample) +
    theme_classic()+ 
    theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_text(size=8, angle = 45,hjust = 1)) 

Statistics

We did 2 types of a-parametric tests for independence, testing the predicted counts against the observed counts:

  • Cochran–Mantel–Haenszel Test (CMH) for Repeated Tests of Independence
  • Replicated G-test of goodness of fit

Cochran–Mantel–Haenszel Test for Repeated Tests of Independence

CMH Test for females

CMH Test for males

Replicated G-test of goodness of fit